Data loading and cleaning

It turns out that some observations contain the string “#DIV/0!” and the corresponding covariates were converted to factors, so we have to treat “#DIV/0!” as NA.

# Read the train/test CSVs; both 'NA' and the '#DIV/0!' artifact strings
# are treated as missing so numeric columns are not coerced to factors.
na.tokens <- c('#DIV/0!', 'NA')
data <- read.csv('pml-training.csv', na.strings = na.tokens)
test <- read.csv('pml-testing.csv', na.strings = na.tokens)
# summary(data)

# clean completely NA vars
# clean completely NA vars
# Fraction of missing values in each training column.  vapply() (not sapply())
# guarantees a numeric vector; mean(is.na(x)) == sum(is.na(x)) / length(x).
col.ratio.na <- vapply(data, function(x) mean(is.na(x)), numeric(1))
# Keep only fully-observed columns, and apply the SAME selection to the test
# set so train and test retain identical column layouts.
data <- data[col.ratio.na == 0]
test <- test[col.ratio.na == 0]
names(data)
##  [1] "X"                    "user_name"            "raw_timestamp_part_1"
##  [4] "raw_timestamp_part_2" "cvtd_timestamp"       "new_window"          
##  [7] "num_window"           "roll_belt"            "pitch_belt"          
## [10] "yaw_belt"             "total_accel_belt"     "gyros_belt_x"        
## [13] "gyros_belt_y"         "gyros_belt_z"         "accel_belt_x"        
## [16] "accel_belt_y"         "accel_belt_z"         "magnet_belt_x"       
## [19] "magnet_belt_y"        "magnet_belt_z"        "roll_arm"            
## [22] "pitch_arm"            "yaw_arm"              "total_accel_arm"     
## [25] "gyros_arm_x"          "gyros_arm_y"          "gyros_arm_z"         
## [28] "accel_arm_x"          "accel_arm_y"          "accel_arm_z"         
## [31] "magnet_arm_x"         "magnet_arm_y"         "magnet_arm_z"        
## [34] "roll_dumbbell"        "pitch_dumbbell"       "yaw_dumbbell"        
## [37] "total_accel_dumbbell" "gyros_dumbbell_x"     "gyros_dumbbell_y"    
## [40] "gyros_dumbbell_z"     "accel_dumbbell_x"     "accel_dumbbell_y"    
## [43] "accel_dumbbell_z"     "magnet_dumbbell_x"    "magnet_dumbbell_y"   
## [46] "magnet_dumbbell_z"    "roll_forearm"         "pitch_forearm"       
## [49] "yaw_forearm"          "total_accel_forearm"  "gyros_forearm_x"     
## [52] "gyros_forearm_y"      "gyros_forearm_z"      "accel_forearm_x"     
## [55] "accel_forearm_y"      "accel_forearm_z"      "magnet_forearm_x"    
## [58] "magnet_forearm_y"     "magnet_forearm_z"     "classe"

Ok, looks like all the data (variance, kurtosis, etc.) that was calculated with the sliding-window method is doomed…

The Revelation

In fact, I have a strong feeling that the variable classe can be accurately predicted by means of the timestamp. The dataset has a strong structure. In the paper the researchers said that they had asked participants to perform the exercise in a specific way (classe) and recorded the data afterwards.

If test data was sampled randomly from the initial dataset, then classe of test samples can be recovered from the train observations, that were recorded in the same time…

# Naive nearest-timestamp prediction: for each test row, collect the training
# observations from the same user recorded in the same second
# (raw_timestamp_part_1), then take the classe of the observation whose
# sub-second timestamp (raw_timestamp_part_2) is closest.
pred_naive <- character(nrow(test))  # preallocate instead of growing with append()
for (i in seq_len(nrow(test))) {
  temp <- data[data$user_name == test$user_name[i] &
               data$raw_timestamp_part_1 == test$raw_timestamp_part_1[i],
               c('raw_timestamp_part_2', 'classe')]
  delta <- abs(temp$raw_timestamp_part_2 - test$raw_timestamp_part_2[i])
  # print(min(delta))
  pred_naive[i] <- as.character(temp$classe[which.min(delta)])
}
pred <- data.frame(problem_id = test$problem_id, classe = pred_naive)
pred
##    problem_id classe
## 1           1      B
## 2           2      A
## 3           3      B
## 4           4      A
## 5           5      A
## 6           6      E
## 7           7      D
## 8           8      B
## 9           9      A
## 10         10      A
## 11         11      B
## 12         12      C
## 13         13      B
## 14         14      A
## 15         15      E
## 16         16      E
## 17         17      A
## 18         18      B
## 19         19      B
## 20         20      B

Sometimes the time differences can be quite large, so this strategy could be wrong. Anyway, let’s assume we are not aware of this “trick”.

Timestamp exploration

# Sort both sets chronologically within each participant
# (user, then second-level, then sub-second timestamp).
data <- data[with(data, order(user_name, raw_timestamp_part_1, raw_timestamp_part_2)), ]
test <- test[with(test, order(user_name, raw_timestamp_part_1, raw_timestamp_part_2)), ]

# data[60:90, c('raw_timestamp_part_1','raw_timestamp_part_2', 
#               'num_window', 'new_window', 'cvtd_timestamp')]

According to this resource, raw_timestamp_part_1 has precision up to seconds. raw_timestamp_part_2 is probably for tinier time measures. num_window is probably an ID of the windows. It is not clear what new_window is.

Windows have different length.

table(data$new_window, data$num_window)

Measurements exploration

It seems to me that the name of the participant is a very useful covariate, since different people have different movement characteristics. Let’s see if all participants have approximately the same number of observations.

addmargins(table(data$user_name, data$classe))
##           
##                A     B     C     D     E   Sum
##   adelmo    1165   776   750   515   686  3892
##   carlitos   834   690   493   486   609  3112
##   charles    899   745   539   642   711  3536
##   eurico     865   592   489   582   542  3070
##   jeremy    1177   489   652   522   562  3402
##   pedro      640   505   499   469   497  2610
##   Sum       5580  3797  3422  3216  3607 19622

Ok, so we don’t have anything odd with the number of observations per participant, but the classes are a bit imbalanced.

Distributions

Let’s see how our measurements are spread with respect to the user_name and classe values.

library(ggplot2)
# Boxplot of one sensor (roll_belt) per class, faceted by participant.
bp = ggplot(data, aes(y=roll_belt, x=classe)) +
  geom_boxplot(aes(fill=user_name)) +
  facet_grid(.~user_name)
print(bp)

# Repeat the same boxplot for every measurement column (columns 8:59
# span the sensor variables; see names(data) above).
# NOTE(review): aes_string() is deprecated in ggplot2 >= 3.0; consider
# aes(y = .data[[nm[i]]]) — confirm the installed ggplot2 version first.
nm = colnames(data)
for (i in seq(8,59)){
    
  bp = ggplot(data, aes_string(y=nm[i], x='classe')) +
    geom_boxplot(aes(fill=user_name)) +
    facet_grid(.~user_name)
  
  print(bp)
  }

Ok, at least we can identify some outliers..

Considering time domain

Another option to look at the data is to consider the time domain. It turns out that the variable X is responsible for timing. But it is not that good for visualization purposes…

# Per-user observation index: the i-th chronologically sorted observation of
# each participant gets time == i.  ave() applies seq_along within each
# user_name group, replacing the explicit loop and the NA placeholder column.
data$time <- ave(seq_len(nrow(data)), data$user_name, FUN = seq_along)

# One sensor over the per-user time index, colored by class,
# one panel per participant.
p <- ggplot(data, aes(y=total_accel_belt, x=time)) +
  geom_point(aes(color=classe)) +
  facet_grid(.~user_name, scales='free', space='free')
print(p)

Let’s look at other covariates:

# Scatter every sensor column (8:59) against the per-user time index.
# NOTE(review): aes_string() is deprecated in ggplot2 >= 3.0; consider
# aes(y = .data[[nm[i]]]) — confirm the installed ggplot2 version first.
nm = colnames(data)
for (i in seq(8,59)){
    
  p = ggplot(data, aes_string(y=nm[i], x='time')) +
    geom_point(aes(color=classe)) +
    facet_grid(.~user_name, scales='free', space='free')
  
  print(p)
  }

Remove outliers

With the naked eye we can identify some outliers. Let’s wipe them out.

# Hand-picked outlier mask based on visual inspection of the plots above:
# a few global sensor thresholds plus user/class-specific spikes.
# idx is TRUE for rows to discard.  (The columns used here survived the
# zero-NA filter earlier, so the logical OR cannot produce NA values.)
idx = data$gyros_dumbbell_x < -50 | data$gyros_dumbbell_y > 20 | data$gyros_dumbbell_z >100 |
  data$accel_dumbbell_x < -200 | 
  (data$accel_dumbbell_y > 100 & data$user_name == 'eurico' & data$classe == 'A') |  
  (data$accel_dumbbell_z > 200 & data$user_name == 'eurico' & data$classe == 'A') | 
  (data$accel_dumbbell_z < -200 & data$user_name == 'carlitos' & data$classe == 'A') |
  data$magnet_dumbbell_y < -1000 | data$total_accel_forearm > 90 | data$gyros_forearm_x < -5 |
  data$gyros_forearm_y > 100 | data$gyros_forearm_z > 100 |
  data$accel_forearm_y > 500 | 
  (data$accel_forearm_z < 0 & data$user_name == 'eurico' & data$classe == 'A') |
  (data$accel_forearm_x < 100 & data$user_name == 'eurico' & data$classe == 'A') |
  (data$accel_forearm_x < 100 & data$user_name == 'carlitos' & data$classe == 'A') |
  data$yaw_belt < -100 | data$gyros_belt_x > 1 | 
  (data$magnet_belt_x > 300 & data$user_name == 'adelmo' & data$classe == 'E') |
  (data$magnet_belt_x > 150 & data$user_name == 'eurico' & data$classe == 'E') |
  (data$magnet_belt_z > -375 & data$user_name == 'eurico' & data$classe == 'E') |
  (data$total_accel_dumbbell >20 & data$user_name == 'carlitos' & data$classe == 'A') |
  (data$total_accel_dumbbell >20 & data$user_name == 'eurico' & data$classe == 'A')

# Number of rows flagged as outliers.
print(sum(idx))
## [1] 1409
# Drop the flagged rows from the training data (test is left untouched).
data = data[!idx,]

# Also remove the following columns:
# (arm/forearm Euler-angle columns, judged unreliable from the plots above)
cols = c('roll_forearm', 'pitch_forearm', 'yaw_forearm', 'roll_arm', 'pitch_arm', 'yaw_arm')
data = data[setdiff(names(data), cols)]

Transformation

First we will normalize the data, since it has rather different scales, and perform PCA afterwards.

library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(doMC) # to run in parallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Center/scale the sensor columns (8:53 after the column drops above) and
# fit PCA, keeping components that explain 90% of the variance.
preProc = preProcess(x = data[,8:53], 
                         method = c('center', 'scale', 'pca'),
                         thresh = 0.9)

# Apply the transform fitted on the TRAINING data to both sets (no leakage).
data_pca = predict(preProc, data)
test_pca = predict(preProc, test)

Let’s see..

# PC1 vs PC2 for the whole training set, colored by participant,
# one panel per class.
p <- ggplot(data_pca, aes(y = PC1, x = PC2)) +
  geom_point(aes(color = user_name)) +
  facet_grid(. ~ classe)
print(p)

# The same projection drawn separately for each participant,
# this time colored by class.
for (n in unique(data$user_name)) {
  df_temp <- predict(preProc, data[data$user_name == n, ])
  p <- ggplot(df_temp, aes(y = PC1, x = PC2)) +
    geom_point(aes(color = classe)) +
    ggtitle(n)
  print(p)
}

Ok, we can distinguish between participants rather than between classes… But this is not our task :)

How about we make unique preprocessing for each participant:

# Fit a separate center/scale/PCA transform for each participant,
# stored in a named list keyed by user name.
preProcUsers <- list()
for (participant in unique(data$user_name)) {
  preProc <- preProcess(x = data[data$user_name == participant, 8:53],
                        method = c('center', 'scale', 'pca'),
                        thresh = 0.9)
  preProcUsers[[participant]] <- preProc
}

Seems a bit better to me..

# Project each participant's data through their OWN preprocessor and
# plot PC1 vs PC2 colored by class.
for (n in unique(data$user_name)) {
  df_temp <- predict(preProcUsers[[n]], data[data$user_name == n, ])
  p <- ggplot(df_temp, aes(y = PC1, x = PC2)) +
    geom_point(aes(color = classe)) +
    ggtitle(n)
  print(p)
}

However, using that method is incorrect since, for instance, PC1 has a different meaning for each user.

Learning

Let’s recap what we have:

# Class balance per participant after outlier removal.
addmargins(table(data_pca$user_name, data_pca$classe))
##           
##                A     B     C     D     E   Sum
##   adelmo    1142   693   719   505   598  3657
##   carlitos   317   689   493   486   609  2594
##   charles    899   745   539   642   711  3536
##   eurico     268   592   489   582   490  2421
##   jeremy    1177   488   652   522   560  3399
##   pedro      640   505   499   469   493  2606
##   Sum       4443  3712  3391  3206  3461 18213
# Leave only relevant columns: drop identifier/timestamp bookkeeping
# variables so they cannot leak into the model.
cols <- c('X', 'raw_timestamp_part_1', 'raw_timestamp_part_2',
          'cvtd_timestamp', 'new_window', 'num_window', 'time')

data_pca <- data_pca[, setdiff(names(data_pca), cols)]
test_pca <- test_pca[, setdiff(names(test_pca), cols)]

We already have test sample. For quality estimations we will use CV.

# 5-fold cross-validation repeated 5 times; class probabilities enabled.
# (TRUE spelled out — T is a reassignable variable, not a reserved word.)
fitControl <- trainControl(method = 'repeatedcv',
                           number = 5,
                           repeats = 5,
                           classProbs = TRUE)

Model Fit

# Reproducibility + parallel backend for caret.
set.seed(123)
registerDoMC(4) # to run in parallel

# Fitting is expensive, so cache the trained model on disk and
# reload it on subsequent runs.
if (!file.exists('rfFit.RData')) {
  ## (re)fit the model
  rfFit <- train(classe ~ ., data = data_pca,
                 allowParallel = TRUE,
                 method = 'rf',
                 metric = 'Accuracy',
                 trControl = fitControl,
                 verbose = FALSE)
  # It could take a while, so lets save it
  save(rfFit, file = 'rfFit.RData')
} else {
  ## load model
  load('rfFit.RData')
}

The model

print(rfFit)
## Random Forest 
## 
## 18213 samples
##    16 predictors
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times) 
## Summary of sample sizes: 14570, 14570, 14572, 14569, 14571, 14569, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      Accuracy SD  Kappa SD   
##    2    0.9548344  0.9433299  0.003151888  0.003955084
##   11    0.9594245  0.9490975  0.003607349  0.004526450
##   20    0.9453575  0.9314449  0.004250353  0.005335055
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 11.

Feature Importances

varImpPlot(rfFit$finalModel)

CV Results

print(rfFit$finalModel)
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry, allowParallel = TRUE,      verbose = FALSE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 11
## 
##         OOB estimate of  error rate: 3.3%
## Confusion matrix:
##      A    B    C    D    E class.error
## A 4347   19   40   27   10  0.02160702
## B   58 3558   83    3   10  0.04148707
## C   36   48 3270   33    4  0.03568269
## D   22    3  114 3056   11  0.04678727
## E    3   24   34   19 3381  0.02311471

Prediction

# Predict the held-out test set with the tuned random forest and pair
# each prediction with its problem id.
pred_cl <- predict(rfFit, test_pca)
pred_rf <- data.frame(problem_id = test_pca$problem_id,
                      classe = pred_cl)
pred_rf
##    problem_id classe
## 1           4      A
## 2           9      A
## 3          11      B
## 4          18      B
## 5          10      A
## 6           5      A
## 7          20      B
## 8          13      B
## 9          16      E
## 10         14      A
## 11          2      A
## 12          3      C
## 13          8      B
## 14         12      C
## 15          7      D
## 16          6      E
## 17         15      E
## 18         17      A
## 19         19      B
## 20          1      B